{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# x = \"\"\"\n",
    "# translation/malay-english/translated-0.json\n",
    "# translation/malay-english/translated-100000.json\n",
    "# translation/malay-english/translated-200000.json\n",
    "# translation/malay-english/translated-300000.json\n",
    "# translation/malay-english/translated-400000.json\n",
    "# translation/malay-english/translated-500000.json\n",
    "# translation/malay-english/translated-600000.json\n",
    "# translation/malay-english/translated-700000.json\n",
    "# translation/malay-english/translated-800000.json\n",
    "# translation/malay-english/translated-900000.json\n",
    "# translation/malay-english/translated-1000000.json\n",
    "# translation/malay-english/translated-1100000.json\n",
    "# translation/malay-english/translated-1200000.json\n",
    "# translation/malay-english/translated-1300000.json\n",
    "# translation/malay-english/translated-1400000.json\n",
    "# translation/malay-english/translated-1500000.json\n",
    "# translation/malay-english/translated-1600000.json\n",
    "# translation/malay-english/translated-1700000.json\n",
    "# translation/malay-english/translated-1800000.json\n",
    "# translation/malay-english/translated-1900000.json\n",
    "# translation/malay-english/translated-2000000.json\n",
    "# translation/malay-english/translated-2100000.json\n",
    "# translation/malay-english/translated-2200000.json\n",
    "# translation/malay-english/translated-2300000.json\n",
    "# translation/malay-english/translated-2400000.json\n",
    "# translation/malay-english/translated-2500000.json\n",
    "# translation/malay-english/translated-2600000.json\n",
    "# translation/malay-english/translated-2700000.json\n",
    "# translation/malay-english/translated-2800000.json\n",
    "# translation/malay-english/translated-2900000.json\n",
    "# translation/malay-english/translated-3000000.json\n",
    "# translation/malay-english/translated-3100000.json\n",
    "# translation/malay-english/translated-3200000.json\n",
    "# translation/malay-english/translated-3300000.json\n",
    "# translation/malay-english/translated-3400000.json\n",
    "# translation/malay-english/translated-3500000.json\n",
    "# \"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import os\n",
    "# url = 'https://f000.backblazeb2.com/file/malay-dataset/'\n",
    "\n",
    "# for row in x.split('\\n'):\n",
    "#     if not len(row):\n",
    "#         continue\n",
    "#     link = f'{url}{row}'\n",
    "#     os.system(f'wget {link}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/translation/parliament/translated-trainset-parliament.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/translation/opus/gnome-ms-en.json X\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/translation/opus/kde4-ms-en.json X\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/translation/opus/opensubtitle-ms-en.json X\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/translation/opus/qed-ms-en.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/translation/opus/tanzil-ms-en.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/translation/opus/ubuntu-ms-en.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "with open('translated-trainset-parliament.json') as fopen:\n",
    "    data = json.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "rejected = ['PERTANYAAN-PERTANYAAN JAWAB LISAN', 'PENGGAL KEEMPAT', 'PUSAT JAGAAN BERDAFTAR',\n",
    "           'BILANGAN PUSAT JAGAAN', 'pewan']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "50460\n"
     ]
    }
   ],
   "source": [
    "selected, reject = [], []\n",
    "for row in data:\n",
    "    if any([r.lower() in row[0].lower() for r in rejected]):\n",
    "        reject.append(row)\n",
    "        continue\n",
    "    s = row[0]\n",
    "    if (sum(c.isdigit() for c in s) / len(s)) > 0.15:\n",
    "        reject.append(row)\n",
    "        continue\n",
    "    if sum(c.isalpha() for c in s) == 0:\n",
    "        reject.append(row)\n",
    "        continue\n",
    "    selected.append(row)\n",
    "    \n",
    "print(len(selected))\n",
    "x_parliament, y_parliament = list(zip(*selected))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "with open('ubuntu-ms-en.json') as fopen:\n",
    "    data = json.load(fopen)\n",
    "    \n",
    "X, Y = list(zip(*data))\n",
    "X = list(X)\n",
    "Y = list(Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('qed-ms-en.json') as fopen:\n",
    "    data = json.load(fopen)\n",
    "    \n",
    "x, y = list(zip(*data))\n",
    "X.extend(x)\n",
    "Y.extend(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('tanzil-ms-en.json') as fopen:\n",
    "    data = json.load(fopen)\n",
    "    \n",
    "x, y = list(zip(*data))\n",
    "X.extend(x)\n",
    "Y.extend(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['translated-3200000.json',\n",
       " 'translated-700000.json',\n",
       " 'translated-2100000.json',\n",
       " 'translated-3300000.json',\n",
       " 'translated-2300000.json',\n",
       " 'translated-2000000.json',\n",
       " 'translated-600000.json',\n",
       " 'translated-900000.json',\n",
       " 'translated-1000000.json',\n",
       " 'translated-1100000.json',\n",
       " 'translated-1900000.json',\n",
       " 'translated-500000.json',\n",
       " 'translated-1400000.json',\n",
       " 'translated-1500000.json',\n",
       " 'translated-2600000.json',\n",
       " 'translated-200000.json',\n",
       " 'translated-2900000.json',\n",
       " 'translated-3400000.json',\n",
       " 'translated-3500000.json',\n",
       " 'translated-2800000.json',\n",
       " 'translated-300000.json',\n",
       " 'translated-2500000.json',\n",
       " 'translated-3100000.json',\n",
       " 'translated-1300000.json',\n",
       " 'translated-2400000.json',\n",
       " 'translated-100000.json',\n",
       " 'translated-1600000.json',\n",
       " 'translated-2700000.json',\n",
       " 'translated-0.json',\n",
       " 'translated-800000.json',\n",
       " 'translated-1800000.json',\n",
       " 'translated-2200000.json',\n",
       " 'translated-1200000.json',\n",
       " 'translated-1700000.json',\n",
       " 'translated-3000000.json',\n",
       " 'translated-400000.json']"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from glob import glob\n",
    "\n",
    "translated = glob('translated*0.json')\n",
    "translated"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "for file in translated:\n",
    "    with open(file) as fopen:\n",
    "        data = json.load(fopen)\n",
    "\n",
    "    x, y = list(zip(*data))\n",
    "    X.extend(x)\n",
    "    Y.extend(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import os\n",
    "\n",
    "# prefix = 'https://f000.backblazeb2.com/file/malay-dataset/'\n",
    "# urls = \"\"\"\n",
    "# wiki-wizard/dialogs.translate\n",
    "# wiki-wizard/informations-0.json.translate\n",
    "# wiki-wizard/informations-100000.json.translate\n",
    "# wiki-wizard/informations-200000.json.translate\n",
    "# chatbot/convai2/convai2-0.json.translate\n",
    "# chatbot/convai2/convai2-100000.json.translate\n",
    "# paraphrase/funpedia/rephrase.json.translate\n",
    "# \"\"\"\n",
    "# for url in urls.split('\\n'):\n",
    "#     if len(url):\n",
    "#         print(url)\n",
    "#         os.system(f'wget {prefix}{url}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "from unidecode import unidecode\n",
    "import re\n",
    "\n",
    "def cleaning(string):\n",
    "    string = unidecode(string).replace('\\n', ' ').replace('\\t', ' ')\n",
    "    string = re.sub(r'[ ]+', ' ', string).strip()\n",
    "    return string\n",
    "\n",
    "def check(string):\n",
    "    string = re.sub('[^A-Za-z\\- ]+', ' ', string)\n",
    "    string = re.sub(r'[ ]+', ' ', string.lower()).strip()\n",
    "    return string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "informations-100000.json.translate 1.982215263559356\n",
      "dialogs.translate 2.4126386389843635\n",
      "informations-200000.json.translate 1.6322194321035348\n",
      "informations-0.json.translate 2.2543461041397426\n",
      "convai2-100000.json.translate 1.8807818751752086\n",
      "convai2-0.json.translate 2.149234888669429\n"
     ]
    }
   ],
   "source": [
    "translated = glob('*.translate')\n",
    "\n",
    "x, y = [], []\n",
    "\n",
    "rejected = ['lexus', 'little', 'lizards', 'lizard']\n",
    "\n",
    "for t in translated:\n",
    "    \n",
    "    if 'rephrase.json' in t:\n",
    "        continue\n",
    "        \n",
    "    with open(t) as fopen:\n",
    "        data = json.load(fopen)\n",
    "        \n",
    "    count = 0\n",
    "    for no, row in enumerate(data):\n",
    "        splitted = row[0]['text'].split('<>')\n",
    "        splitted_bm = row[1].split('<>')\n",
    "        if len(splitted) != len(splitted_bm):\n",
    "            count += 1\n",
    "            continue\n",
    "        \n",
    "        for k in range(len(splitted)):\n",
    "            s = check(splitted[k])\n",
    "            if any([r in s for r in rejected]):\n",
    "                continue\n",
    "            y.append(splitted[k])\n",
    "            x.append(splitted_bm[k])\n",
    "        \n",
    "    print(t, count / len(data) * 100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1406877"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(' saya mempunyai rambut coklat. ', ' i have brown hair. '),\n",
       " (' Istilah \"si rambut coklat\" adalah bentuk feminin dari perkataan Perancis \"brunet\", yang merupakan bentuk kecil dari \"brun\" yang bermaksud \"berambut coklat / berambut perang\", yang femininnya adalah \"brune\".',\n",
       "  ' The term \"brunette\" is the feminine form of the French word \"brunet\", which is a diminutive form of \"brun\" meaning \"brown/brown-haired\", the feminine of which is \"brune\".'),\n",
       " ('Rambut coklat ', 'Brown hair '),\n",
       " (' saya mempunyai rambut coklat. ', ' i have brown hair. '),\n",
       " (' Bentuk \"brun\" (diucapkan) masih sering digunakan di Skotlandia, terutama di luar bandar, dan juga kata untuk \"coklat\" dalam bahasa Skandinavia.',\n",
       "  ' The form \"brun\" (pronounced ) is still commonly used in Scotland, particularly in rural areas, and is also the word for \"brown\" in the Scandinavian languages.'),\n",
       " ('Pewarnaan rambut ', 'Hair coloring '),\n",
       " (' saya mempunyai rambut ungu. ', ' i have purple hair. '),\n",
       " (' Sebab utama untuk ini adalah kosmetik: untuk menutup rambut beruban atau putih, menukar warna yang dianggap lebih bergaya atau diinginkan, untuk mengembalikan warna rambut asli setelah ia berubah warna oleh proses pendandan rambut atau pemutihan sinar matahari.',\n",
       "  ' The main reasons for this are cosmetic: to cover gray or white hair, to change to a color regarded as more fashionable or desirable, to restore the original hair color after it has been discolored by hairdressing processes or sun bleaching.'),\n",
       " ('Rambut coklat ', 'Brown hair '),\n",
       " (' saya mempunyai rambut coklat. ', ' i have brown hair. '),\n",
       " (' Semua istilah ini akhirnya berasal dari akar Proto-Indo-Eropah * \"bhrūn-\" \"coklat, kelabu\".',\n",
       "  ' All of these terms ultimately derive from the Proto-Indo-European root *\"bhrūn-\" \"brown, grey\".'),\n",
       " ('Pewarnaan rambut ', 'Hair coloring '),\n",
       " (' saya mempunyai rambut ungu. ', ' i have purple hair. '),\n",
       " (' Pewarnaan rambut, atau pewarnaan rambut, adalah amalan menukar warna rambut.',\n",
       "  ' Hair coloring, or hair dyeing, is the practice of changing the hair color.'),\n",
       " ('Pewarnaan rambut ', 'Hair coloring '),\n",
       " (' saya mempunyai rambut ungu. ', ' i have purple hair. '),\n",
       " (' Hari ini, pewarnaan rambut sangat popular, dengan 75% wanita dan 18% lelaki yang tinggal di Copenhagen telah melaporkan menggunakan pewarna rambut menurut kajian oleh University of Copenhagen.',\n",
       "  ' Today, hair coloring is very popular, with 75% of women and 18% of men living in Copenhagen having reported using hair dye according to a study by the University of Copenhagen.'),\n",
       " ('Pewarnaan rambut ', 'Hair coloring '),\n",
       " (' saya mempunyai rambut ungu. ', ' i have purple hair. '),\n",
       " (' Pewarnaan di rumah di Amerika Syarikat mencapai $ 1.9 bilion pada tahun 2011 dan dijangka meningkat menjadi $ 2.2 bilion pada tahun 2016.',\n",
       "  ' At home coloring in the United States reached $1.9 billion in 2011 and is expected to raise to $2.2 billion by 2016.'),\n",
       " ('Pewarnaan rambut ', 'Hair coloring '),\n",
       " (' saya mempunyai rambut ungu. ', ' i have purple hair. '),\n",
       " (' Sebilangan dari mereka bercukur bersih, tetapi yang lain - terutama yang berpangkat tinggi, mencukur pipinya tetapi meninggalkan misai yang menutupi seluruh mulut ... \".',\n",
       "  ' Some of them are clean-shaven, but others - especially those of high rank, shave their cheeks but leave a moustache that covers the whole mouth...\".'),\n",
       " ('Pewarnaan rambut ', 'Hair coloring '),\n",
       " (' saya mewarnakan rambut saya berambut perang. ', ' i dye my hair blonde. '),\n",
       " (' Sebab utama untuk ini adalah kosmetik: untuk menutup rambut beruban atau putih, menukar warna yang dianggap lebih bergaya atau diinginkan, untuk mengembalikan warna rambut asli setelah ia berubah warna oleh proses pendandan rambut atau pemutihan sinar matahari.',\n",
       "  ' The main reasons for this are cosmetic: to cover gray or white hair, to change to a color regarded as more fashionable or desirable, to restore the original hair color after it has been discolored by hairdressing processes or sun bleaching.'),\n",
       " ('Pewarnaan rambut ', 'Hair coloring '),\n",
       " (' saya mempunyai rambut ungu. ', ' i have purple hair. '),\n",
       " (' Rambut mereka berambut perang, tetapi tidak secara semula jadi: mereka memutihkannya, hingga hari ini, secara artifisial, membasuhnya dengan kapur dan menyisirnya kembali dari dahi mereka.',\n",
       "  ' Their hair is blond, but not naturally so: they bleach it, to this day, artificially, washing it in lime and combing it back from their foreheads.'),\n",
       " ('Pewarnaan rambut ', 'Hair coloring '),\n",
       " (' saya mempunyai rambut ungu. ', ' i have purple hair. '),\n",
       " (' Diodorus Siculus, sejarawan Yunani menerangkan secara terperinci tentang bagaimana orang Celtic mengecat rambut mereka Blonde: \"Aspek mereka menakutkan ...',\n",
       "  ' Diodorus Siculus, a Greek Historian described in detail of how the Celtic people dyed their hair Blonde: \"Their aspect is terrifying...'),\n",
       " ('Pewarnaan rambut ', 'Hair coloring '),\n",
       " (' saya mempunyai rambut ungu. ', ' i have purple hair. '),\n",
       " (' Pewarnaan rambut boleh dilakukan secara profesional oleh pendandan rambut atau secara bebas di rumah.',\n",
       "  ' Hair coloring can be done professionally by a hairdresser or independently at home.'),\n",
       " ('Pewarnaan rambut ', 'Hair coloring '),\n",
       " (' saya mewarnakan rambut saya berambut perang. ', ' i dye my hair blonde. '),\n",
       " (' Hari ini, pewarnaan rambut sangat popular, dengan 75% wanita dan 18% lelaki yang tinggal di Copenhagen telah melaporkan menggunakan pewarna rambut menurut kajian oleh University of Copenhagen.',\n",
       "  ' Today, hair coloring is very popular, with 75% of women and 18% of men living in Copenhagen having reported using hair dye according to a study by the University of Copenhagen.'),\n",
       " ('Pewarnaan rambut ', 'Hair coloring '),\n",
       " (' saya mempunyai rambut ungu. ', ' i have purple hair. '),\n",
       " (' Mereka bertubuh tinggi, dengan otot-otot yang bergelombang di bawah kulit putih jernih.',\n",
       "  ' They are very tall in stature, with rippling muscles under clear white skin.'),\n",
       " ('Pewarnaan rambut ', 'Hair coloring '),\n",
       " (' saya mewarnakan rambut saya berambut perang. ', ' i dye my hair blonde. '),\n",
       " (' Pewarnaan di rumah di Amerika Syarikat mencapai $ 1.9 bilion pada tahun 2011 dan dijangka meningkat menjadi $ 2.2 bilion pada tahun 2016.',\n",
       "  ' At home coloring in the United States reached $1.9 billion in 2011 and is expected to raise to $2.2 billion by 2016.'),\n",
       " ('Pewarnaan rambut ', 'Hair coloring '),\n",
       " (' saya mempunyai rambut ungu. ', ' i have purple hair. '),\n",
       " (' Mereka kelihatan seperti setan kayu, rambutnya tebal dan lebat seperti surai kuda.',\n",
       "  \" They look like wood-demons, their hair thick and shaggy like a horse's mane.\"),\n",
       " ('Pewarnaan rambut ', 'Hair coloring '),\n",
       " (' saya mewarnakan rambut saya berambut perang. ', ' i dye my hair blonde. '),\n",
       " (' Diodorus Siculus, sejarawan Yunani menerangkan secara terperinci tentang bagaimana orang Celtic mengecat rambut mereka Blonde: \"Aspek mereka menakutkan ...',\n",
       "  ' Diodorus Siculus, a Greek Historian described in detail of how the Celtic people dyed their hair Blonde: \"Their aspect is terrifying...'),\n",
       " ('Pewarnaan rambut ', 'Hair coloring '),\n",
       " (' saya mewarnakan rambut saya berambut perang. ', ' i dye my hair blonde. '),\n",
       " (' Pewarnaan rambut boleh dilakukan secara profesional oleh pendandan rambut atau secara bebas di rumah.',\n",
       "  ' Hair coloring can be done professionally by a hairdresser or independently at home.'),\n",
       " ('Pewarnaan rambut ', 'Hair coloring '),\n",
       " (' saya mewarnakan rambut saya berambut perang. ', ' i dye my hair blonde. '),\n",
       " (' Pewarnaan rambut, atau pewarnaan rambut, adalah amalan menukar warna rambut.',\n",
       "  ' Hair coloring, or hair dyeing, is the practice of changing the hair color.'),\n",
       " ('Pewarnaan rambut ', 'Hair coloring '),\n",
       " (' saya mewarnakan rambut saya berambut perang. ', ' i dye my hair blonde. '),\n",
       " (' Mereka bertubuh tinggi, dengan otot-otot yang bergelombang di bawah kulit putih jernih.',\n",
       "  ' They are very tall in stature, with rippling muscles under clear white skin.'),\n",
       " ('Pewarnaan rambut ', 'Hair coloring '),\n",
       " (' saya mewarnakan rambut saya berambut perang. ', ' i dye my hair blonde. '),\n",
       " (' Rambut mereka berambut perang, tetapi tidak secara semula jadi: mereka memutihkannya, hingga hari ini, secara artifisial, membasuhnya dengan kapur dan menyisirnya kembali dari dahi mereka.',\n",
       "  ' Their hair is blond, but not naturally so: they bleach it, to this day, artificially, washing it in lime and combing it back from their foreheads.'),\n",
       " ('Pewarnaan rambut ', 'Hair coloring '),\n",
       " (' saya mewarnakan rambut saya berambut perang. ', ' i dye my hair blonde. '),\n",
       " (' Mereka kelihatan seperti setan kayu, rambutnya tebal dan lebat seperti surai kuda.',\n",
       "  \" They look like wood-demons, their hair thick and shaggy like a horse's mane.\"),\n",
       " ('Pewarnaan rambut ', 'Hair coloring '),\n",
       " (' saya mewarnakan rambut saya berambut perang. ', ' i dye my hair blonde. '),\n",
       " (' Sebilangan dari mereka bercukur bersih, tetapi yang lain - terutama yang berpangkat tinggi, mencukur pipinya tetapi meninggalkan misai yang menutupi seluruh mulut ... \".',\n",
       "  ' Some of them are clean-shaven, but others - especially those of high rank, shave their cheeks but leave a moustache that covers the whole mouth...\".'),\n",
       " ('Rambut coklat ', 'Brown hair '),\n",
       " (' saya mempunyai rambut coklat. ', ' i have brown hair. '),\n",
       " (' Ini dicirikan oleh tahap eumelanin pigmen gelap yang lebih tinggi dan tahap pheomelanin pigmen pucat yang lebih rendah.',\n",
       "  ' It is characterized by higher levels of the dark pigment eumelanin and lower levels of the pale pigment pheomelanin.'),\n",
       " ('Rambut coklat ', 'Brown hair '),\n",
       " (' saya mempunyai rambut coklat. ', ' i have brown hair. '),\n",
       " (' Rambut coklat biasa terjadi di kalangan penduduk di dunia Barat, terutama di kalangan orang-orang dari Eropah Tengah, Eropah Tenggara, Eropah Timur, Eropah Selatan, Cone Selatan, Amerika Syarikat, dan juga beberapa populasi di Timur Tengah Besar di mana ia beralih dengan lancar ke rambut hitam.',\n",
       "  ' Brown hair is common among populations in the Western world, especially among those from Central Europe, Southeastern Europe, Eastern Europe, Southern Europe, Southern Cone, the United States, and also some populations in the Greater Middle East where it transitions smoothly into black hair.'),\n",
       " ('Rambut coklat ', 'Brown hair '),\n",
       " (' saya mempunyai rambut coklat. ', ' i have brown hair. '),\n",
       " (' Orang dengan rambut coklat sering disebut sebagai si rambut coklat, yang dalam bahasa Perancis adalah bentuk feminin dari \"brunet\", pengurang \"brun\" (coklat, berambut coklat atau berambut gelap).',\n",
       "  ' People with brown hair are often referred to as brunette, which in French is the feminine form of \"brunet,\" the diminutive of \"brun\" (brown, brown-haired or dark-haired).'),\n",
       " ('Rambut coklat ', 'Brown hair '),\n",
       " (' saya mempunyai rambut coklat. ', ' i have brown hair. '),\n",
       " (' Rambut coklat adalah warna rambut manusia kedua yang paling umum, setelah rambut hitam.',\n",
       "  ' Brown hair is the second most common human hair color, after black hair.'),\n",
       " ('Rambut coklat ', 'Brown hair '),\n",
       " (' saya mempunyai rambut coklat. ', ' i have brown hair. '),\n",
       " (' Istilah \"si rambut coklat\" adalah bentuk feminin dari perkataan Perancis \"brunet\", yang merupakan bentuk kecil dari \"brun\" yang bermaksud \"berambut coklat / berambut perang\", yang femininnya adalah \"brune\".',\n",
       "  ' The term \"brunette\" is the feminine form of the French word \"brunet\", which is a diminutive form of \"brun\" meaning \"brown/brown-haired\", the feminine of which is \"brune\".'),\n",
       " ('Rambut coklat ', 'Brown hair '),\n",
       " (' saya mempunyai rambut coklat. ', ' i have brown hair. '),\n",
       " (' Bervariasi dari coklat muda hingga rambut hampir hitam.',\n",
       "  ' It varies from light brown to almost black hair.'),\n",
       " ('Rambut coklat ', 'Brown hair '),\n",
       " (' saya mempunyai rambut coklat. ', ' i have brown hair. '),\n",
       " (' Jalurnya lebih tebal daripada rambut yang cantik tetapi tidak sebesar rambut yang berwarna merah.',\n",
       "  ' Its strands are thicker than those of fair hair but not as much as those of red hair.'),\n",
       " ('Rambut coklat ', 'Brown hair '),\n",
       " (' saya mempunyai rambut coklat. ', ' i have brown hair. '),\n",
       " (' Bentuk \"brun\" (diucapkan) masih sering digunakan di Skotlandia, terutama di luar bandar, dan juga kata untuk \"coklat\" dalam bahasa Skandinavia.',\n",
       "  ' The form \"brun\" (pronounced ) is still commonly used in Scotland, particularly in rural areas, and is also the word for \"brown\" in the Scandinavian languages.'),\n",
       " ('Merah ', 'Red '),\n",
       " (' warna kegemaran saya adalah merah. ', ' my favorite color is red. '),\n",
       " (' Merah berkisar dari warna merah tua berwarna kuning tua dan vermillion hingga merah kemerahan kebiruan, dan berbeza warna dari merah jambu merah pucat hingga burgundy merah gelap.',\n",
       "  ' Reds range from the brilliant yellow-tinged scarlet and vermillion to bluish-red crimson, and vary in shade from the pale red pink to the dark red burgundy.'),\n",
       " ('Rambut coklat ', 'Brown hair '),\n",
       " (' saya mempunyai rambut coklat. ', ' i have brown hair. '),\n",
       " (' Selain itu, rambut coklat adalah perkara biasa di kalangan Orang Asli Australia dan orang Melanesia.',\n",
       "  ' Additionally, brown hair is common among Australian Aborigines and Melanesians.'),\n",
       " ('Merah ', 'Red '),\n",
       " (' warna kegemaran saya adalah merah. ', ' my favorite color is red. ')]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "i = random.randint(0, len(x) - 100)\n",
    "list(zip(x[i: i + 100], y[i: i + 100]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "X.extend(x)\n",
    "Y.extend(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Sertakan Gmail, Google Docs, Google+, YouTube dan Picasa'"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('../texts.json.translate') as fopen:\n",
    "    news = json.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(len(news)):\n",
    "    if news[i][0] != news[i][1]:\n",
    "        X.append(news[i][1])\n",
    "        Y.append(news[i][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "for file in glob('../dataset-*.json.translate')[:6]:\n",
    "    with open(file) as fopen:\n",
    "        news = json.load(fopen)\n",
    "        \n",
    "    for i in range(len(news)):\n",
    "        if news[i][0] != news[i][1]:\n",
    "            X.append(news[i][1])\n",
    "            Y.append(news[i][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5770424/5770424 [02:09<00:00, 44561.05it/s]\n"
     ]
    }
   ],
   "source": [
    "filtered_X, filtered_Y = [], []\n",
    "\n",
    "for i in tqdm(range(len(X))):\n",
    "    X[i] = cleaning(X[i])\n",
    "    Y[i] = cleaning(Y[i])\n",
    "    if len(X[i]) and len(Y[i]):\n",
    "        filtered_X.append(X[i])\n",
    "        filtered_Y.append(Y[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5770283/5770283 [00:02<00:00, 2152627.76it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "4.574004429245497"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "count, ids = 0, []\n",
    "for i in tqdm(range(len(filtered_X))):\n",
    "    if filtered_X[i] == filtered_Y[i]:\n",
    "        count += 1\n",
    "        ids.append(i)\n",
    "        \n",
    "count / len(filtered_X) * 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5770283/5770283 [00:07<00:00, 778648.43it/s] \n"
     ]
    }
   ],
   "source": [
    "uniques = set()\n",
    "for i in tqdm(range(len(filtered_X))):\n",
    "    s = f'{filtered_X[i]} [EENNDD] {filtered_Y[i]}'\n",
    "    uniques.add(s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4004517"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "uniques = list(uniques)\n",
    "len(uniques)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 4004517/4004517 [02:37<00:00, 25414.45it/s]\n"
     ]
    }
   ],
   "source": [
    "X, Y = [], []\n",
    "for i in tqdm(range(len(uniques))):\n",
    "    x, y = uniques[i].split(' [EENNDD] ')\n",
    "    if check(x) == check(y):\n",
    "        continue\n",
    "    X.append(x)\n",
    "    Y.append(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3885323"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('Dan sesiapa yang ringan timbangan amal baiknya, maka merekalah orang-orang yang merugikan dirinya sendiri; mereka kekal di dalam neraka Jahannam -',\n",
       "  'But those whose scales are light - those are the ones who have lost their souls, [being] in Hell, abiding eternally.'),\n",
       " ('\"Mukim ini sebelum ini pernah diwartakan sebagai kawasan wabak dan ia berlaku sekali lagi, apapun kita perlu ambil tindakan pengawasan sepertimana kita lakukan sekarang,\" katanya.',\n",
       "  '\"This district was previously gazetted as an outbreak and it is happening again, no matter what we need to do, we need to take the necessary precautions as we do now,\" he said.'),\n",
       " ('\"Dalam serbuan berkenaan, kita turut merampas satu tiub straw dipercayai mengandungi dadah jenis heroin seberat 0.7 gram, satu paket ketulan kristal dipercayai syabu seberat 0.6 gram, satu paket dipercayai heroin seberat 37.2 gram dan paket plastik dipercayai syabu seberat 6.9 gram.',\n",
       "  '\"In the raid, we also seized a straw tube believed to contain 0.7 grams of heroin, a pack of crystal syrup believed to weigh 0.6 grams, a pack of heroin believed to weigh 37.2 grams and a plastic package of syabu weighing 6.9 grams.'),\n",
       " ('Kisah ini berasal dari terbitan Fortune pada 25 Februari 2013.',\n",
       "  'This story is from the February 25, 2013 issue of Fortune.'),\n",
       " ('Cummins menyatakan bahawa anjing bela jinak, sungguhpun dijumpai dalam hampir semua masyarakat manusia, dipinggirkan oleh ahli antropologi.',\n",
       "  'Cummins states that puppies, although found in almost all human societies, are marginalized by anthropologists.'),\n",
       " ('Formula asal ialah: di mana A ialah persiaran maksimum seismograf Wood-Anderson, fungsi empirik A bergantung hanya pada jarak epipusat stesen tersebut, formula_2.',\n",
       "  \"The original formula is: where A is the maximum displacement of the Wood-Anderson seismograph, the empirical function A depends only on the station's epicenter distance, formula_2.\"),\n",
       " ('Saya suka sekali-sekala keluar. Saya suka berkhemah dan sebagainya. Pergi ke tasik, dll',\n",
       "  'I enjoy going out occassionally. I like camping and such. Going to the lake, etc'),\n",
       " ('Dua loji yang dipandu gas, Stesen Tenaga Pasir Gudang dengan 210 MW dan Stesen Kuasa Sultan Iskandar dengan 269 MW, terletak di Pasir Gudang.',\n",
       "  'Two gas-powered plants, 210 MW Pasir Gudang Station and 269 MW Sultan Iskandar Power Station, are located in Pasir Gudang.'),\n",
       " ('Kugiran ini terkenal dengan irama baharu dan muzik gabungan unsur budaya dengan rentak moden.',\n",
       "  'The band is known for its new rhythms and the music of a mix of cultural elements with a modern rhythm.'),\n",
       " ('Dang, saya bertaruh. Adakah anda suka bunga?',\n",
       "  'Dang, I bet. Do you like flowers?')]"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the last ten kept pairs as (X, Y) tuples.\n",
    "list(zip(X[-10:], Y[-10:]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Y is passed first on purpose: the Y side becomes the source (train_X / test_X)\n",
    "# and the X side the target, which is the direction of the en-to-ms dataset\n",
    "# saved later in this notebook.\n",
    "# A fixed random_state makes the 2% held-out test set reproducible across re-runs.\n",
    "train_X, test_X, train_Y, test_Y = train_test_split(Y, X, test_size = 0.02, random_state = 42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('Jesetice Jesetice is a village and municipality located in Benesov region, Central Bohemian province, Czech Republic.',\n",
       "  'Jesetice Jesetice merupakan sebuah kampung dan kawasan perbandaran yang terletak di wilayah Benesov, wilayah Bohemia Tengah, Republik Czech.'),\n",
       " ('I see. So what else do you do other than being a family man?',\n",
       "  'Saya faham. Jadi apa lagi yang anda lakukan selain menjadi lelaki keluarga?'),\n",
       " ('North Felda Lawin Mosque Nurul Iman Mosque is a mosque located in North Felda Lawin, Gerik, Perak.',\n",
       "  'Masjid Felda Lawin Utara Masjid Nurul Iman merupakan sebuah masjid yang terletak di Felda Lawin Utara, Gerik, Perak.'),\n",
       " ('Women watching the progress from across the water in Nov.1966. Saving the Palace of Fine Arts, meant partially demolishing it and then reconstruction with better materials. Women watching the progress from across the water in Nov.1966. Saving the Palace of Fine Arts, meant partially demolishing it and then reconstruction with better materials. Photo shot 10/20/1966 Photo ran 10/30/1966. Saving the Palace of Fine Arts, meant partially demolishing it and then reconstruction with better materials. Photo shot 10/20/1966 Photo ran 10/30/1966. These are the columns at the North end of Palace of Fine Arts being reconstructed in Aug.1965. These are the columns at the North end of Palace of Fine Arts being reconstructed in Aug.1965. Saving the Palace of Fine Arts, meant partially demolishing it and then reconstruction with better materials. Photo shot 10/20/1966 Photo ran 10/30/1966. Saving the Palace of Fine Arts, meant partially demolishing it and then reconstruction with better materials.',\n",
       "  'Wanita menyaksikan kemajuan dari seberang perairan pada Nov.1966. Menyimpan Istana Seni Halus, bermaksud merobohkannya sebagian dan kemudian membina semula dengan bahan yang lebih baik. Wanita menyaksikan kemajuan dari seberang perairan pada Nov.1966. Menyimpan Istana Seni Halus, bermaksud merobohkannya sebagian dan kemudian membina semula dengan bahan yang lebih baik. Tangkapan gambar 10/20/1966 Foto berlari 10/30/1966. Menyimpan Istana Seni Halus, bermaksud merobohkannya sebagian dan kemudian membina semula dengan bahan yang lebih baik. Tangkapan gambar 10/20/1966 Foto berlari 10/30/1966. Ini adalah tiang-tiang di hujung utara Istana Seni Halus yang dibina semula pada Ogos.1655. Ini adalah tiang-tiang di hujung utara Istana Seni Halus yang dibina semula pada Ogos.1655. Menyimpan Istana Seni Halus, bermaksud merobohkannya sebagian dan kemudian membina semula dengan bahan yang lebih baik. Tangkapan gambar 10/20/1966 Foto berlari 10/30/1966. Menyimpan Istana Seni Halus, bermaksud merobohkannya sebagian dan kemudian membina semula dengan bahan yang lebih baik.'),\n",
       " ('In the Minority Tangon, it is said that when the ancestors of the Kadazan-Hamlets settled in the early settlement of Nunuk Ragang, an attack came from the land in the form of small creatures.',\n",
       "  'Dalam Tangon Minorit, dikatakan bahawa pada waktu nenek moyang Kadazan- Dusun tinggal dalam penempatan awal di Nunuk Ragang, ada serangan yang muncul dari tanah dalam bentuk mahkluk-mahluk kecil.'),\n",
       " ('Zaskia is a student of the Faculty of Psychology of the University of Paramadina.',\n",
       "  'Zaskia merupakan seorang mahasiswa dari Fakulti Psikologi Universiti Paramadina.'),\n",
       " ('Q: The issue of traffic congestion is sometimes a determining factor as travel times will double, two or three times more than usual.',\n",
       "  'S: Isu kesesakan lalu lintas kadang-kala menjadi faktor penentu kerana masa perjalanan akan berganda, dua atau tiga kali berbanding biasa.'),\n",
       " ('I have an omelet for breakfast each morning, I use 6 eggs.',\n",
       "  'Saya mempunyai telur dadar untuk sarapan setiap pagi, saya menggunakan 6 biji telur.'),\n",
       " ('they had hair down to here,\" Mr. Kim said, pointing to his waist, as his eyes lit up at the memory. He added that some clowns who did manage to marry would sometimes leave their wives for fellow clowns. Mr. Kim himself married and had one son. He said he, too, had biris during his life, though he said the relations had not been sexual. \"Relations between men were very sincere and genuine,\" Mr. Kim said. \"It was an amazing, remarkable relationship, much closer than anything between a husband and wife\".',\n",
       "  'rambut mereka sampai ke sini, \"kata Mr. Kim sambil menunjuk ke pinggangnya, sambil matanya menyala di ingatan. Dia menambah bahawa beberapa badut yang berjaya menikah kadang-kadang meninggalkan isteri mereka untuk sesama badut. Mr Kim sendiri telah berkahwin dan mempunyai seorang anak lelaki. Dia mengatakan bahawa dia juga mempunyai biris selama hidupnya, walaupun dia mengatakan hubungan itu tidak seksual. \"Hubungan antara lelaki sangat ikhlas dan tulus,\" kata Mr Kim. \"Itu luar biasa, hubungan yang luar biasa, jauh lebih dekat daripada apa-apa antara suami dan isteri \".'),\n",
       " ('It will be difficult for the enemy to come from above the ground.',\n",
       "  'Ia akan memberi kesukaran kepada musuh yang datang dari atas tanah.')]"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Last ten training pairs as (train_X, train_Y) tuples.\n",
    "list(zip(train_X[-10:], train_Y[-10:]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('He said: O my people! serve Allah, you have no god other than Him; will you not then guard (against evil)?',\n",
       "  'Sembahlah kamu akan Allah, (sebenarnya) tidak ada Tuhan bagi kamu selain daripadaNya. Oleh itu, tidakkah kamu mahu bertaqwa kepadaNya? \"'),\n",
       " (\"JOHOR BAHRU: The United Indigenous People's Party (ARMADA) Johor People's Party (BERSATU) Johor reported on the Central Police Station here today on three Facebook accounts (FBs) that have slandered Chief Minister Datuk Osman Sapian.\",\n",
       "  'JOHOR BAHRU: Angkatan Bersatu Anak Muda (ARMADA) Parti Pribumi Bersatu Malaysia (BERSATU) Johor membuat laporan di Balai Polis Sentral di sini, hari ini, berhubung tiga akaun laman sosial Facebook (FB) yang menfitnah Menteri Besar, Datuk Osman Sapian.'),\n",
       " ('PHOENIX -- Brittney Griner raised the WNBA Defensive Player of the Year trophy just before tipoff and then showed why she received the honor. Griner had 18 points and a WNBA playoff-record 11 blocks to lead the Phoenix Mercury to an 88-55 win over the Tulsa Shock on Thursday night in the opener of the Western Conference semifinals. Griner, selected the league\\'s top defender for the second straight year, also had eight rebounds. \"I try to go for that,\" Griner said of the defensive player award. \"That\\'s the one I try to get. I just want to show why I won it\". Griner, who led the league in blocks with 105 this season, broke her own single-game postseason record and shot 5 for 7 from the field and 8 for 9 on free throws. DeWanna Bonner added 15 points for the defending champion Mercury. Phoenix, the No.2 seed in the West, hasn\\'t lost a first-round playoff series since 2000 and is 10-1 all-time in a playoff series after winning the first game.',\n",
       "  'PHOENIX - Brittney Griner menaikkan trofi Pemain Pertahanan Terbaik WNBA tepat sebelum tamat dan kemudian menunjukkan mengapa dia mendapat penghormatan itu. Griner memperoleh 18 mata dan 11 blok playoff WNBA untuk memimpin Phoenix Mercury meraih kemenangan 88-55 ke atas Tulsa Shock pada Khamis malam dalam pembukaan separuh akhir Persidangan Barat. Griner, yang terpilih sebagai pertahanan utama liga untuk tahun kedua berturut-turut, juga mendapat lapan lantunan. \"Saya berusaha untuk melakukannya,\" kata Griner mengenai penghargaan pemain pertahanan. \"Itulah yang saya cuba dapatkan. Saya hanya ingin menunjukkan mengapa saya memenanginya\". Griner, yang memimpin liga di blok dengan 105 musim ini, memecahkan rekod pasca musim permainannya sendiri dan menembak 5 untuk 7 dari lapangan dan 8 untuk 9 dengan lontaran percuma. DeWanna Bonner menambah 15 mata untuk juara bertahan, Mercury. Phoenix, pilihan nombor dua di Barat, tidak kalah dalam siri playoff pusingan pertama sejak tahun 2000 dan 10-1 sepanjang masa dalam siri playoff setelah memenangi pertandingan pertama.'),\n",
       " ('This can be felt when the government has agreed to repeal some of the laws that restrict media freedom such as the Printing Press and Sedition Act and also amend the Malaysian Communications and Multimedia Act.',\n",
       "  'Ini dapat dirasai apabila kerajaan bersetuju menghapuskan beberapa akta yang menyekat kebebasan media seperti Akta Mesin Cetak dan Pener Akta Hasutan dan juga meminda Akta Komunikasi dan Multimedia Malaysia.'),\n",
       " ('Founded in 1889, it is the only hospital operated outside of the metropolitan area of Sydney by the South Sydney Local Health District.',\n",
       "  'Ditubuhkan pada tahun 1889, ia adalah satu-satunya hospital yang dikendalikan di luar kawasan metropolitan Sydney oleh Daerah Kesihatan Tempatan Sydney Barat Selatan.'),\n",
       " ('\"However, the government can still reach its original target of fiscal deficit at 2.8 per cent of GDP 2018, considering the possibility of higher dividend yields from government-linked entities and a pragmatic approach to implementing some of Pakatan Harapan\\'s 100-day pledge policy changes,\" he said in a statement today.',\n",
       "  '\"Bagaimanapun, kerajaan masih boleh mencapai sasaran asal defisit fiskal pada 2.8 peratus daripada KDNK 2018, mengambil kira kemungkinan hasil dividen lebih tinggi daripada entiti berkaitan kerajaan dan pendekatan pragmatik bagi melaksanakan beberapa perubahan dasar ikrar 100 hari Pakatan Harapan,\" katanya dalam kenyataan hari ini.'),\n",
       " ('I know I can do that because this is on the line.',\n",
       "  'Ini boleh dibuat kerana garis lurus melalui titik ini.'),\n",
       " ('Premier Kathleen Wynne\\'s government is slowing down the phase-in of its controversial new Ontario Retirement Pension Plan (ORPP) to ease the transition for small and medium-sized businesses. Under the new plan, only large businesses with more than 500 employees will have to begin contributing immediately when the ORPP begins in 2017. Medium-sized firms -- those with 50-499 employees -- will contribute more starting in 2018, while small businesses won\\'t have to pay the increase until 2019. The self-employed will also be required to contribute to the ORPP, as they do to CPP, starting in 2019. The Ontario plan will require companies to pay premiums of 1.9 per cent of salary for each employee, up to $1,643 a year, and workers will pay an equal amount. Workers and companies with \"comparable\" workplace pension plans will be exempt. Wynne said the Ontario Retirement Pension Plan (ORPP) is needed because:.',\n",
       "  'Kerajaan Perdana Menteri Kathleen Wynne melambatkan fasa masuk Pelan Pencen Persaraan Ontario (ORPP) yang baru yang kontroversial untuk memudahkan peralihan untuk perniagaan kecil dan sederhana. Di bawah rancangan baru, hanya perniagaan besar dengan lebih daripada 500 pekerja yang harus mula menyumbang segera apabila ORPP bermula pada tahun 2017. Syarikat bersaiz sederhana - yang mempunyai 50-499 pekerja - akan menyumbang lebih banyak mulai tahun 2018, sementara perniagaan kecil tidak perlu membayar kenaikan sehingga 2019. Pekerja sendiri juga diminta untuk menyumbang kepada ORPP, seperti yang mereka lakukan kepada CPP, mulai tahun 2019. Pelan Ontario akan menghendaki syarikat membayar premium sebanyak 1.9 peratus daripada gaji untuk setiap pekerja, sehingga $ 1,643 setahun, dan pekerja akan membayar jumlah yang sama. Pekerja dan syarikat dengan rancangan pencen tempat kerja \"setanding\" akan dikecualikan. Wynne mengatakan Pelan Pencen Persaraan Ontario (ORPP) diperlukan kerana:.'),\n",
       " ('My eyes are hazel green.', 'Mata saya berwarna hijau hazel.'),\n",
       " ('Felsoszentmarton Felsoszentmarton is a village located in the Baranya region of Hungary.',\n",
       "  'Felsoszentmarton Felsoszentmarton merupakan sebuah kampung yang terletak di wilayah Baranya, Hungary.')]"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# First ten training pairs as (train_X, train_Y) tuples.\n",
    "list(zip(train_X[:10], train_Y[:10]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('ORCID was started on October 16, 2012.',\n",
       "  'ORCID telah dimulakan pada 16 Oktober 2012.'),\n",
       " ('Although not many people think of widows, thank God I have done this single mom\\'s job well, \"she said.',\n",
       "  'Meskipun tidak ramai orang beranggapan baik dengan janda, Alhamdulillah saya telah menggalas tugas ibu tunggal ini dengan baik,\" ujarnya lagi.'),\n",
       " (\"Hammersmith & City flow map before the new Wood Lane station opened and before Shepherd's Bush station (Hammersmith & City) was named Shepherd's Bush Market.\",\n",
       "  \"Peta aliran Hammersmith & City sebelum stesen baru Wood Lane dibuka dan sebelum stesen Shepherd's Bush (Hammersmith & City) dinamakan menjadi Shepherd's Bush Market.\"),\n",
       " (\"But I wasn't joking around.\", 'Tapi saya bukan membuat lawak.'),\n",
       " ('The leaves are 50-150 cm long, tapered or wide, with a width of 3.5 to 20 cm wide.',\n",
       "  'Daunnya sepanjang 50-150 cm panjang, berbentuk tirus atau lebar, dengan kelebaran 3.5 sehingga 20 cm lebar.'),\n",
       " ('File transfer already started', 'Pemindahan fail sudah bermula'),\n",
       " ('In 2015, a new residential college was built to accommodate the growing number of students.',\n",
       "  'Pada 2015, sebuah kolej kediaman baharu dibina bagi menampung bilangan pelajar yang semakin bertambah.'),\n",
       " ('The four-cylinder engine produces a pulse blast at 80 Hz, producing speeds of up to 200 pounds (890 newton).',\n",
       "  'Enjin terdiri daripada empat tiub menghasilkan letupan denyut pada frekuensi80 Hz, menghasilkan tujahan sehingga 200 paun (890 newton).'),\n",
       " ('The security of all people regardless of religion must be guaranteed especially in their respective places of worship.',\n",
       "  'Keamanan semua orang tanpa mengira agama mesti dijamin terutama dalam ruang rumah ibadat masing-masing.'),\n",
       " ('Three Chelsea supporters were barred from entering the Eden Arena',\n",
       "  'Tiga penyokong Chelsea dihalang memasuki Eden Arena')]"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Last ten held-out test pairs as (test_X, test_Y) tuples.\n",
    "list(zip(test_X[-10:], test_Y[-10:]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.utils import shuffle\n",
    "\n",
    "# Shuffle both training lists with the same permutation so pairs stay aligned;\n",
    "# the fixed random_state makes the resulting order reproducible across re-runs.\n",
    "train_X, train_Y = shuffle(train_X, train_Y, random_state = 42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bundle the four splits under their own keys and serialise them as a single JSON file.\n",
    "splits = {'train_X': train_X, 'train_Y': train_Y, 'test_X': test_X, 'test_Y': test_Y}\n",
    "with open('dataset-en-to-ms.json', 'w') as fopen:\n",
    "    json.dump(splits, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mkdir: cannot create directory ‘train-en’: File exists\n",
      "mkdir: cannot create directory ‘test-en’: File exists\n"
     ]
    }
   ],
   "source": [
    "# -p keeps this cell idempotent: no error when the directories already exist.\n",
    "!mkdir -p train-en test-en"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write each split as a pair of line-aligned text files: line i of left.txt and\n",
    "# line i of right.txt hold the two sides of pair i.\n",
    "# NOTE(review): assumes no sentence contains a newline, otherwise the two files\n",
    "# fall out of alignment - confirm against the cleaning done earlier.\n",
    "parallel_files = [\n",
    "    ('train-en/left.txt', train_X),\n",
    "    ('train-en/right.txt', train_Y),\n",
    "    ('test-en/left.txt', test_X),\n",
    "    ('test-en/right.txt', test_Y),\n",
    "]\n",
    "for path, sentences in parallel_files:\n",
    "    # UTF-8 is pinned so the output does not depend on the platform's default encoding.\n",
    "    with open(path, 'w', encoding = 'utf-8') as fopen:\n",
    "        fopen.write('\\n'.join(sentences))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train-en/\n",
      "train-en/left.txt\n",
      "train-en/right.txt\n",
      "test-en/\n",
      "test-en/left.txt\n",
      "test-en/right.txt\n"
     ]
    }
   ],
   "source": [
    "# Pack each split directory into its own gzip-compressed tarball (-v lists the archived files).\n",
    "!tar -czvf train-en-ms.tar.gz train-en\n",
    "!tar -czvf test-en-ms.tar.gz test-en"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3807616"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of pairs in the training split.\n",
    "len(train_X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(\"You know how the story goes. Two heterosexual, almost-certainly white people are struggling in their lives. They meet each other, get together, break up, get together again and find true happiness and completion in each others arms. Finally, they can start living a normal life! Rom-coms can be enjoyable escapism, but it's a tale that is told a bit too often, one that is too lacking in diversity, too reliant on gender stereotypes and too concerned with selling us a brand of love that is impossible to live up to: a 2008 study at Heriot Watt University found that rom-coms have a negative effect on relationships, making us chase unobtainable standards of love. In the process of writing my new play Ross & Rachel, which confronts the myths of modern love and opens at the Edinburgh Festival Fringe this August, I've had to think a lot about romance in fiction from Romeo and Juliet to Pride and Prejudice right up to Notting Hill. Why do we continue to keep telling the same story?\",\n",
       "  'Anda tahu bagaimana kisahnya. Dua orang kulit putih heteroseksual, hampir pasti bergelut dalam hidup mereka. Mereka bertemu satu sama lain, berkumpul, berpisah, berkumpul kembali dan mencari kebahagiaan dan penyelesaian yang sebenarnya dalam satu sama lain. Akhirnya, mereka dapat mula menjalani kehidupan yang normal! Rom-com boleh menjadi escapism yang menyeronokkan, tetapi ini adalah kisah yang terlalu sering diceritakan, kisah yang terlalu kurang dalam kepelbagaian, terlalu bergantung pada stereotaip jantina dan terlalu peduli dengan menjual jenama cinta yang mustahil untuk kita hadapi : kajian tahun 2008 di Heriot Watt University mendapati bahawa rom-com mempunyai kesan negatif terhadap hubungan, menjadikan kita mengejar standard cinta yang tidak dapat dicapai. Dalam proses penulisan drama baru saya Ross & Rachel, yang menghadap mitos cinta moden dan dibuka di Edinburgh Festival Fringe pada bulan Ogos ini, saya harus banyak berfikir tentang percintaan dalam fiksyen dari Romeo dan Juliet hingga Pride and Prejudice betul hingga ke Notting Hill. Mengapa kita terus menceritakan kisah yang sama?'),\n",
       " ('02/24/2016 AT 11:35 AM EST. wasn\\'t totally on board for. \"I did not want him to really go to New York. It\\'s scary,\" Kardashian told hosts Ross Mathews and Garcelle Beauvais during an appearance on., traveled with Kardashian to the Yeezy Season 3 presentation - his. From left: Khloe Kardashian, Ross Mathews and Garcelle Beauvais. \"I had a ton of anxiety,\" Kardashian said of taking her estranged husband. and to the event. \"I\\'m so protective. Your first time out at Madison Square Garden? Like, I just felt like, \\'Is it too much stimulus for his brain? Who knows? \\' \"., Kardashian said, \"Everyone cleared him\" for the trip. \"He was good to go and it meant so much to Lamar,\" she explained. \"And I thought it was so dope of Kanye to want to walk out with Lamar. And just that feeling for Lamar to do that on his own\". Kardashian, 31, also noted that brother-in-law West was \"really instrumental\" in Odom\\'s recovery. \"Kanye would come to the hospital and.',\n",
       "  '02/24/2016 Pukul 11:35 PG EST. tidak betul-betul dalam perjalanan untuk. \"Saya tidak mahu dia benar-benar pergi ke New York. Ini menakutkan,\" kata Kardashian kepada tuan rumah Ross Mathews dan Garcelle Beauvais semasa penampilan., Melakukan perjalanan dengan Kardashian ke persembahan Yeezy Musim 3 - miliknya. Dari kiri: Khloe Kardashian, Ross Mathews dan Garcelle Beauvais. \"Saya mengalami banyak kegelisahan,\" kata Kardashian ketika mengambil suaminya yang terpisah. dan ke majlis itu. \"Saya sangat pelindung. Kali pertama anda keluar di Madison Square Garden? Seperti, saya hanya merasa, \\'Adakah terlalu banyak rangsangan untuk otaknya? Siapa yang tahu?\\'\"., Kardashian berkata, \"Semua orang membersihkannya\" untuk perjalanan. \"Dia senang pergi dan itu sangat berarti bagi Lamar,\" jelasnya. \"Dan aku rasa sangat Kanye ingin keluar bersama Lamar. Dan perasaan itu untuk Lamar melakukan itu sendiri\". Kardashian, 31, juga menyatakan bahawa saudara ipar Barat \"sangat berperanan\" dalam pemulihan Odom. \"Kanye akan datang ke hospital dan.'),\n",
       " ('Click-Drag to insert an anchor on the path',\n",
       "  'Klik-Seret untuk sisip sauh ke laluan'),\n",
       " ('Print Preview', 'Pratonton Cetakan'),\n",
       " ('I like The Beatles. They have some good songs. Do you agree?',\n",
       "  'Saya suka The Beatles. Mereka mempunyai beberapa lagu yang bagus. Adakah anda bersetuju?'),\n",
       " ('Search Compiz Core Options', 'Gelintar Pilihan Compiz Core'),\n",
       " (\"The only assistance he had was from the lizards who lay in the darkness on the wall outside the windows. Darting their heads forward, they nonchalantly snapped up the termites by their ones and twos, calmly munching them in their long thin jaws. By the time I arrived in the kitchen there were so many termites in the room that it was almost dark. ''That meat smells disgusting,'' I said. ''It is disgusting,'' he said.\",\n",
       "  \"Satu-satunya bantuan yang dia dapat adalah dari kadal yang terbaring dalam kegelapan di dinding di luar tingkap. Sambil menggerakkan kepala ke depan, mereka dengan santai mengambil anai-anai oleh mereka dan kedua-duanya, dengan tenang mengunyahnya di rahang panjang mereka yang nipis. Pada masa saya tiba di dapur terdapat banyak anai-anai di dalam bilik sehingga hampir gelap. '' Daging itu berbau menjijikkan, '' kataku. '' Itu menjijikkan, '' katanya.\"),\n",
       " ('White House chief economic advisor Larry Kudlow, in an interview with CNBC on Tuesday (22/1), denied having a meeting scheduled for this week.',\n",
       "  'Kepala penasihat ekonomi Gedung Putih Larry Kudlow, dalam sebuah wawancara dengan CNBC pada Selasa (22/1), membantah adanya pertemuan yang dijadwalkan untuk minggu ini.'),\n",
       " (\"Despite the 1MDB corruption, the Hajj Fund and Felda gained public attention and the BN government saw the people's support decline.\",\n",
       "  'Namum korupsi 1MDB, Tabung Haji dan Felda mendapat perhatian umum dan pemerintah BN melihat sokongan rakyat menurun.'),\n",
       " ('_Add', 'T_ambah+mdraid-add')]"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# First ten held-out test pairs as (test_X, test_Y) tuples.\n",
    "list(zip(test_X[:10], test_Y[:10]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
